# WD
# NOTE(review): setwd() in a script is fragile — consider relative paths or
# here::here() so the analysis runs on other machines.
setwd("~/PhD/London NERC DTP documents/EA data/EnvironmentAgency_EcologyFish_DataDownload_2025-04-02")
# Load data
counts <- read_csv("TraC_Fish_Counts_2025-10-10.csv")
sites <- read_csv("TraC_Fish_Sites_2025-10-10.csv")
# Join count data with site information
# Left join keeps every count row even when the site record is missing.
# EVENT_DATE is parsed to Date; SEASON treats April–September as "Summer".
counts_clean <- counts %>% left_join(sites, by = "SITE_ID") %>%
mutate(EVENT_DATE = ymd(EVENT_DATE),
EVENT_YEAR = year(EVENT_DATE),
EVENT_MONTH = month(EVENT_DATE),
SEASON = if_else(EVENT_MONTH %in% 4:9, "Summer", "Winter"))
# Keep the analysis columns. SITE_NAME.x is the counts-side site name from
# the join (the `.x` suffix shows the sites table also carries a SITE_NAME).
# Note: EVENT_YEAR, EVENT_MONTH and SEASON computed above are not selected
# here; year/month are re-derived from EVENT_DATE below.
counts_clean <- counts_clean %>%
select(
SITE_ID,
SITE_NAME = SITE_NAME.x,
SITE_PARENT_NAME,
SURVEY_ID,
EVENT_DATE,
SPECIES_NAME,
LATIN_NAME,
FISH_COUNT,
NO_OF_RUNS,
TOP_TIER_SITE,
SITE_RANKED_NGR,
SITE_RANKED_EASTING,
SITE_RANKED_NORTHING)
# Convert dates and extract components.
# EVENT_DATE was already parsed to Date by ymd() in the join step above, so
# re-parsing it here was a no-op and has been removed; year/month/day are
# derived directly from the existing Date column.
counts_clean <- counts_clean %>%
  mutate(
    EVENT_DATE_YEAR = year(EVENT_DATE),
    EVENT_MONTH = month(EVENT_DATE),
    EVENT_DAY = day(EVENT_DATE))
# Sampling method --------------------------------------------------------
# Gear is not recorded explicitly, so it is inferred from keywords in the
# site name. The name is lower-cased once (instead of once per pattern, as
# before) and the first matching keyword wins; unmatched sites are "Unknown".
classify_method <- function(site_name) {
  s <- str_to_lower(site_name)
  case_when(
    str_detect(s, "seine") ~ "Seine Net",
    str_detect(s, "beam") ~ "Beam Trawl",
    str_detect(s, "otter") ~ "Otter Trawl",
    str_detect(s, "kick") ~ "Kick Sample",
    str_detect(s, "fyke") ~ "Fyke Net",
    str_detect(s, "push") ~ "Push Net",
    str_detect(s, "hand") ~ "Hand Collection",
    str_detect(s, "gill") ~ "Gill Net",
    TRUE ~ "Unknown"
  )
}
counts_clean <- counts_clean %>%
  mutate(METHOD = classify_method(SITE_NAME))
# Thames only -------------------------------------------------------------
# Keep rows whose TOP_TIER_SITE contains "thames", case-insensitively.
# (Rows with a missing TOP_TIER_SITE are excluded either way: grepl()
# returns FALSE for NA, str_detect() returns NA, and filter() drops both.)
counts_thames <- counts_clean %>%
  filter(grepl("thames", TOP_TIER_SITE, ignore.case = TRUE))
# Lat and long
# Build point geometry from British National Grid eastings/northings
# (EPSG:27700), reproject to WGS84 (EPSG:4326) for leaflet, and copy
# lon/lat back out as plain numeric columns.
sites_sf <- st_as_sf(counts_thames, coords = c("SITE_RANKED_EASTING", "SITE_RANKED_NORTHING"), crs = 27700) %>%
st_transform(crs = 4326)
coords <- st_coordinates(sites_sf)
sites_sf$lon <- coords[, 1]
sites_sf$lat <- coords[, 2]
# Gears -------------------------------------------------------------------
# METHOD_TYPE previously re-derived the gear from SITE_NAME with exactly the
# same keyword rules used for METHOD above. METHOD is already present on
# every row of counts_thames (and therefore of sites_sf), so it is copied
# instead of recomputed.
sites_sf <- sites_sf %>%
  mutate(METHOD_TYPE = METHOD)
##### Map #####
# Interactive leaflet map of Thames survey sites, coloured by inferred gear;
# each gear gets its own toggleable overlay group.
method_types <- unique(sites_sf$METHOD_TYPE)
method_pal <- colorFactor(palette = "Set2", domain = method_types)
leaflet(sites_sf) %>%
addProviderTiles("CartoDB.Positron") %>%
addCircleMarkers(
lng = ~lon, lat = ~lat,
color = ~method_pal(METHOD_TYPE),
radius = 6,
stroke = TRUE, weight = 1, fillOpacity = 0.9,
popup = ~paste0(
"<b>Site:</b> ", SITE_NAME, "<br>",
"<b>Zone:</b> ", TOP_TIER_SITE, "<br>",
"<b>METHOD:</b> ", METHOD_TYPE), group = ~METHOD_TYPE) %>%
addLegend(
position = "bottomright",
pal = method_pal,
values = ~METHOD_TYPE,
title = "Method type",
opacity = 1) %>%
addLayersControl(
overlayGroups = method_types,
options = layersControlOptions(collapsed = FALSE),
position = "topright") %>%
# NOTE(review): addResetMapButton() is not part of base leaflet — presumably
# leaflet.extras; confirm that package is loaded earlier in the project.
addResetMapButton()
# RA
# Totals per gear: individuals and distinct species.
method_totals <- counts_thames %>%
filter(!is.na(METHOD)) %>%
group_by(METHOD) %>%
summarise(
Total_Individuals = sum(FISH_COUNT, na.rm = TRUE),
Total_Species = n_distinct(SPECIES_NAME),
.groups = "drop")
# Get grand total
grand_total <- sum(method_totals$Total_Individuals)
# Add relative abundance column
method_totals <- method_totals %>%
mutate(
Relative_Abundance = round(100 * Total_Individuals / grand_total, 1)
) %>%
arrange(desc(Relative_Abundance)) %>%
mutate(METHOD = factor(METHOD, levels = unique(METHOD)))
##### RA #####
# Bar chart of relative abundance (%) by gear, most abundant first.
ggplot(method_totals, aes(x = reorder(METHOD, -Relative_Abundance), y = Relative_Abundance)) +
geom_col(fill = "tomato") +
labs(
title = "Relative abundance by sampling method",
x = "Sampling method",
y = "Relative abundance (%)") +
theme_minimal()
# Matrix
# Species x method count table, then presence/absence (1 = the method
# recorded that species at least once).
method_species_table <- counts_thames %>%
filter(!is.na(METHOD), !is.na(SPECIES_NAME)) %>%
count(METHOD, SPECIES_NAME) %>%
pivot_wider(names_from = METHOD, values_from = n, values_fill = 0)
method_pa <- method_species_table %>%
mutate(across(-SPECIES_NAME, ~ as.integer(. > 0)))
method_cols <- names(method_pa)[!names(method_pa) %in% "SPECIES_NAME"]
# Species detected by exactly one method; unique_to names that method.
method_unique <- method_pa %>%
rowwise() %>%
mutate(unique_to = if (sum(c_across(all_of(method_cols))) == 1) {
method_cols[which(c_across(all_of(method_cols)) == 1)]
} else {
NA_character_
}) %>%
ungroup() %>%
filter(!is.na(unique_to))
unique_counts <- method_unique %>%
count(unique_to, name = "unique_species_count") %>%
arrange(desc(unique_species_count))
ggplot(unique_counts, aes(x = reorder(unique_to, unique_species_count), y = unique_species_count, fill = unique_to)) +
geom_col(show.legend = FALSE) +
coord_flip() +
labs(
title = "Number of species unique to each method type",
x = "Sampling method", y = "Unique species count") +
theme_minimal()
# Presence/absence heatmap: species (rows) x method (columns).
method_pa_long <- method_pa %>%
pivot_longer(cols = -SPECIES_NAME, names_to = "METHOD", values_to = "present")
ggplot(method_pa_long, aes(x = METHOD, y = SPECIES_NAME, fill = as.factor(present))) +
geom_tile(color = "white") +
scale_fill_manual(values = c("0" = "white", "1" = "steelblue")) +
labs(
title = "Species presence across sampling method",
x = "Sampling method", y = "Species",
fill = "Present") +
theme_minimal() +
theme(axis.text.y = element_text(size = 6))
# Seine net data only.
# 1) Filter to Seine Net
counts_thames_filtered <- counts_thames %>%
filter(METHOD == "Seine Net")
# 2) Per-survey CPUE (fish per run)
# - If NO_OF_RUNS can vary within a survey, switch `first()` to `max()` and sanity-check.
# CPUE = total fish per survey divided by the survey's run count (taken as
# the first NO_OF_RUNS value seen; n_checks counts distinct values).
seine_cpue <- counts_thames_filtered %>%
filter(!is.na(FISH_COUNT), NO_OF_RUNS > 0) %>%
group_by(SURVEY_ID, EVENT_DATE_YEAR) %>%
summarise(
total_fish = sum(FISH_COUNT, na.rm = TRUE),
runs = dplyr::first(NO_OF_RUNS), # or max(NO_OF_RUNS)
n_checks = n_distinct(NO_OF_RUNS), # for sanity checking
.groups = "drop"
) %>%
mutate(CPUE = total_fish / runs)
# Optional sanity check:
# stopifnot(all(seine_cpue$n_checks == 1))
# 3) Yearly, survey-balanced summary (each survey = 1 vote)
cpue_year <- seine_cpue %>%
group_by(EVENT_DATE_YEAR) %>%
summarise(
mean_CPUE = mean(CPUE, na.rm = TRUE),
sd_CPUE = sd(CPUE, na.rm = TRUE),
n_surveys = n(),
se = sd_CPUE / sqrt(n_surveys),
.groups = "drop"
)
# 4) Quick overview plot (mean ± SE)
p1 <- ggplot(cpue_year, aes(x = EVENT_DATE_YEAR, y = mean_CPUE)) +
geom_ribbon(aes(ymin = mean_CPUE - se, ymax = mean_CPUE + se),
alpha = 0.2, fill = "steelblue") +
geom_line(colour = "steelblue", linewidth = 1) +
geom_point(colour = "steelblue", size = 2) +
labs(
title = "Mean seine net CPUE per year – Thames Estuary (survey-balanced)",
x = "Year",
y = "Mean CPUE (fish per run)"
) +
theme_minimal()
# 5) Trend model on yearly means
# Simple OLS of yearly mean CPUE on year; p-value from the slope t-test.
lm1 <- lm(mean_CPUE ~ EVENT_DATE_YEAR, data = cpue_year)
cpue_year$pred <- predict(lm1)
cpue_year$pred_se <- predict(lm1, se.fit = TRUE)$se.fit
s <- summary(lm1)
slope <- coef(s)["EVENT_DATE_YEAR","Estimate"]
pval <- coef(s)["EVENT_DATE_YEAR","Pr(>|t|)"]
lab <- sprintf("LM slope = %.2f fish/run/yr (p = %.3f)", slope, pval)
# 6) Trend plot with fitted mean and 95% CI of the fit
# (normal-approximation band: fit ± 1.96 * pointwise SE)
p2 <- ggplot(cpue_year, aes(EVENT_DATE_YEAR, mean_CPUE)) +
geom_point(size = 2) +
geom_line(col = "grey40") +
geom_ribbon(aes(ymin = pred - 1.96 * pred_se, ymax = pred + 1.96 * pred_se),
fill = "lightblue", alpha = 0.35) +
geom_line(aes(y = pred), col = "blue", linewidth = 1) +
labs(
title = "Trend in mean seine net CPUE – Thames Estuary (survey-balanced)",
x = "Year", y = "Mean CPUE (fish per run)"
) +
annotate("text",
x = min(cpue_year$EVENT_DATE_YEAR, na.rm = TRUE) + 1,
y = max(cpue_year$mean_CPUE, na.rm = TRUE),
hjust = 0, vjust = 1, size = 3.5, label = lab) +
theme_minimal()
p1
p2
# --- per-survey CPUE per species ---
species_svy_cpue <- counts_thames_filtered %>%
filter(NO_OF_RUNS > 0) %>%
group_by(SURVEY_ID, EVENT_DATE_YEAR, SPECIES_NAME) %>%
summarise(
count = sum(FISH_COUNT, na.rm = TRUE),
runs = dplyr::first(NO_OF_RUNS),
.groups = "drop"
) %>%
mutate(CPUE = count / runs)
# --- yearly, survey-balanced CPUE per species ---
species_trends <- species_svy_cpue %>%
group_by(EVENT_DATE_YEAR, SPECIES_NAME) %>%
summarise(
mean_CPUE = mean(CPUE, na.rm = TRUE),
n_surveys = n(),
.groups = "drop"
)
# --- per-species trend (weights = n_surveys) ---
# Fits a weighted OLS of yearly mean CPUE on year for one species; returns
# the slope, a normal-approximation 95% CI (slope ± 1.96 * SE), the p-value
# and a star code. Species with fewer than 3 distinct years, or with a
# constant mean CPUE, are returned as an all-NA row rather than fitted.
species_model_summary <- function(df) {
if (dplyr::n_distinct(df$EVENT_DATE_YEAR) < 3 || sd(df$mean_CPUE, na.rm = TRUE) == 0) {
return(tibble(Slope = NA_real_, CI_low = NA_real_, CI_high = NA_real_,
P_value = NA_real_, Significance = NA_character_))
}
m <- lm(mean_CPUE ~ EVENT_DATE_YEAR, data = df, weights = n_surveys)
s <- summary(m)
slope <- coef(s)["EVENT_DATE_YEAR","Estimate"]
se <- coef(s)["EVENT_DATE_YEAR","Std. Error"]
p <- coef(s)["EVENT_DATE_YEAR","Pr(>|t|)"]
tibble(
Slope = slope,
CI_low = slope - 1.96 * se,
CI_high = slope + 1.96 * se,
P_value = p,
Significance = dplyr::case_when(
p <= 0.001 ~ "***",
p <= 0.01 ~ "**",
p <= 0.05 ~ "*",
TRUE ~ ""
)
)
}
# Fit the weighted trend for every species.
species_results <- species_trends %>%
group_by(SPECIES_NAME) %>%
group_modify(~ species_model_summary(.x)) %>%
ungroup() %>%
mutate(
SigCat = factor(Significance,
levels = c("***","**","*",""),
labels = c("***","**","*","ns"))
)
# Keep only species with p <= 0.05 for plotting.
species_results_sig <- species_results %>%
filter(P_value <= 0.05) %>%
mutate(
SigCat = factor(Significance,
levels = c("***","**","*"),
labels = c("***","**","*"))
)
# --- Define colour palette for significance ---
sig_cols <- c(
"***" = "#3b82f6", # blue
"**" = "#22c55e", # green
"*" = "#ef4444" # red
)
# --- Plot only significant species ---
# Forest-style plot: slope ± 95% CI per species, coloured by significance.
ggplot(species_results_sig, aes(x = Slope, y = reorder(SPECIES_NAME, Slope))) +
geom_vline(xintercept = 0, linetype = "dashed", color = "gray70") +
geom_errorbarh(aes(xmin = CI_low, xmax = CI_high), height = 0.3, color = "gray70") +
geom_point(aes(color = SigCat), size = 3) +
scale_color_manual(values = sig_cols, drop = FALSE, name = "Significance",
breaks = c("***","**","*")) +
labs(
title = "Significant species-level trends in abundance (CPUE) – Thames Estuary",
subtitle = "Only species with p ≤ 0.05 shown. Slope ± 95% CI (weighted by number of surveys).",
x = "Slope (Δ CPUE per year)",
y = "Species",
caption = "Significance: * p ≤ 0.05 | ** p ≤ 0.01 | *** p ≤ 0.001"
) +
theme_minimal() +
theme(
legend.position = "top",
legend.direction = "horizontal",
legend.title = element_text(size = 11),
legend.text = element_text(size = 10)
)
# Per-survey community composition: one row per (year, survey, species)
# with summed counts; left grouped by year + survey for the summary below.
svy_comp <- counts_thames_filtered %>%
  group_by(EVENT_DATE_YEAR, SURVEY_ID, SPECIES_NAME) %>%
  summarise(n = sum(FISH_COUNT, na.rm = TRUE), .groups = "drop_last")
# Per-survey diversity metrics.
div_svy <- svy_comp %>%
  summarise(
    Species_Richness = n_distinct(SPECIES_NAME),
    Total_Abundance = sum(n),
    Shannon_Diversity = vegan::diversity(n, index = "shannon"),
    Simpson_Diversity = vegan::diversity(n, index = "simpson"),
    .groups = "drop"
  ) %>%
  mutate(
    # Margalef richness is undefined when total abundance <= 1 because
    # log(N) = 0. The previous pmax(N, 1) did not prevent division by zero
    # at N == 1 (it produced NaN); such surveys now get an explicit NA.
    Margalef_Richness = if_else(Total_Abundance > 1,
                                (Species_Richness - 1) / log(Total_Abundance),
                                NA_real_),
    # Pielou evenness: log(pmax(S, 2)) avoids division by zero for
    # single-species surveys (their evenness comes out as 0).
    Pielou_Evenness = Shannon_Diversity / log(pmax(Species_Richness, 2))
  )
# Yearly survey-balanced means ± SE of each diversity metric.
# NOTE(review): mean()/sd() here use na.rm = FALSE, so a single NA/NaN
# per-survey metric makes the whole year NA — confirm that is intended.
diversity_summary <- div_svy %>%
group_by(EVENT_DATE_YEAR) %>%
summarise(
Richness = mean(Species_Richness), se_Richness = sd(Species_Richness)/sqrt(n()),
Shannon = mean(Shannon_Diversity), se_Shannon = sd(Shannon_Diversity)/sqrt(n()),
Simpson = mean(Simpson_Diversity), se_Simpson = sd(Simpson_Diversity)/sqrt(n()),
Margalef = mean(Margalef_Richness), se_Margalef = sd(Margalef_Richness)/sqrt(n()),
Pielou = mean(Pielou_Evenness), se_Pielou = sd(Pielou_Evenness)/sqrt(n()),
n_surveys = n(),
.groups = "drop"
)
# Long format for faceting: one panel per metric.
diversity_long <- diversity_summary %>%
select(EVENT_DATE_YEAR,
Richness, Shannon, Simpson, Margalef, Pielou) %>%
pivot_longer(cols = -EVENT_DATE_YEAR,
names_to = "Metric", values_to = "Value")
ggplot(diversity_long, aes(x = EVENT_DATE_YEAR, y = Value)) +
geom_line(color = "red", linewidth = 1) +
geom_smooth(method = "loess", se = TRUE, color = "black", linewidth = 1) +
facet_wrap(~ Metric, scales = "free_y") +
labs(
title = "Diversity metrics over time – Thames Estuary (survey-balanced)",
subtitle = "Per-survey diversity averaged per year (LOESS trend)",
x = "Year", y = "Diversity metric (mean)"
) +
theme_minimal()
# Linear trend of each yearly diversity metric against year.
lm_richness <- lm(Richness ~ EVENT_DATE_YEAR, data = diversity_summary)
lm_shannon <- lm(Shannon ~ EVENT_DATE_YEAR, data = diversity_summary)
lm_simpson <- lm(Simpson ~ EVENT_DATE_YEAR, data = diversity_summary)
lm_margalef <- lm(Margalef ~ EVENT_DATE_YEAR, data = diversity_summary)
lm_evenness <- lm(Pielou ~ EVENT_DATE_YEAR, data = diversity_summary)
# Tidy one fitted model into a one-row data frame: slope, SE,
# normal-approximation 95% CI (slope ± 1.96 * SE), p-value and star code.
# The intersect() call copes with coefficient tables that name the p-value
# column "Pr(>|z|)" instead of "Pr(>|t|)".
summary_table <- function(model, metric_name) {
coef_summary <- summary(model)$coefficients
slope <- coef_summary["EVENT_DATE_YEAR", "Estimate"]
se <- coef_summary["EVENT_DATE_YEAR", "Std. Error"]
p <- coef_summary["EVENT_DATE_YEAR", intersect(c("Pr(>|t|)", "Pr(>|z|)"),
colnames(coef_summary))[1]]
ci_low <- slope - 1.96 * se
ci_high <- slope + 1.96 * se
sig <- case_when(p <= 0.001 ~ "***", p <= 0.01 ~ "**", p <= 0.05 ~ "*", TRUE ~ "")
data.frame(Metric = metric_name, Slope = slope, SE = se,
CI_low = ci_low, CI_high = ci_high,
P_value = round(p, 4), Significance = sig)
}
# Combine the per-metric trend rows into one table.
results <- bind_rows(
summary_table(lm_richness, "Species richness"),
summary_table(lm_shannon, "Shannon diversity"),
summary_table(lm_simpson, "Simpson diversity"),
summary_table(lm_margalef, "Margalef richness"),
summary_table(lm_evenness, "Pielou evenness")
)
# Append significance stars to metric labels (species richness is
# deliberately left unstarred, per the inline comment below).
results_lab <- results %>%
mutate(
SigCat = factor(Significance,
levels = c("***","**","*",""),
labels = c("***","**","*","ns")),
Metric_lab = dplyr::case_when(
Metric == "Species richness" ~ Metric, # never add stars here
Significance == "" ~ Metric, # ns → no stars
TRUE ~ paste0(Metric, " ", Significance)
)
)
sig_cols <- c(
"***" = "#3b82f6", # blue
"**" = "#22c55e", # green
"*" = "#ef4444", # red
"ns" = "#9ca3af" # grey
)
# Forest-style plot: slope ± 95% CI per metric, coloured by significance.
ggplot(results_lab, aes(x = Slope, y = reorder(Metric_lab, Slope))) +
geom_vline(xintercept = 0, linetype = "dashed", color = "gray70") +
geom_errorbarh(aes(xmin = CI_low, xmax = CI_high), height = 0.22, color = "gray70") +
geom_point(aes(color = SigCat), size = 3) +
scale_color_manual(values = sig_cols, drop = FALSE, name = "Significance",
breaks = c("***","**","*","ns")) +
labs(
title = "Trend in diversity metrics over time – Thames Estuary",
subtitle = "Slope ± 95% CI from linear models on yearly survey-balanced means",
x = "Slope (change per year)", y = "Diversity metric (mean)",
caption = "Significance: * p ≤ 0.05 | ** p ≤ 0.01 | *** p ≤ 0.001"
) +
theme_minimal() +
theme(legend.position = "top",
legend.direction = "horizontal",
legend.title = element_text(size = 11),
legend.text = element_text(size = 10))
# Presence/absence per (year, survey, species).
pa_svy <- counts_thames_filtered %>%
filter(!is.na(SPECIES_NAME), !is.na(FISH_COUNT)) %>%
mutate(PA = as.integer(FISH_COUNT > 0)) %>%
group_by(EVENT_DATE_YEAR, SURVEY_ID, SPECIES_NAME) %>%
summarise(present = as.integer(any(PA == 1)), .groups = "drop")
# Balance effort across years: randomly keep the same number of surveys
# (the minimum observed in any year) from every year. set.seed makes the
# subsample reproducible.
svy_per_year <- pa_svy %>%
distinct(EVENT_DATE_YEAR, SURVEY_ID) %>%
count(EVENT_DATE_YEAR, name = "n_surveys")
min_svy <- min(svy_per_year$n_surveys)
set.seed(42)
sampled_surveys <- pa_svy %>%
distinct(EVENT_DATE_YEAR, SURVEY_ID) %>%
group_by(EVENT_DATE_YEAR) %>%
slice_sample(n = min_svy) %>%
ungroup()
# Year x species presence matrix (rows = years) for betapart.
year_species_pa <- pa_svy %>%
semi_join(sampled_surveys, by = c("EVENT_DATE_YEAR","SURVEY_ID")) %>%
group_by(EVENT_DATE_YEAR, SPECIES_NAME) %>%
summarise(present = as.integer(any(present == 1)), .groups = "drop") %>%
tidyr::pivot_wider(names_from = SPECIES_NAME, values_from = present, values_fill = 0) %>%
tibble::column_to_rownames("EVENT_DATE_YEAR") %>%
as.data.frame()
# Sorensen-family beta diversity: total (sor), turnover (sim), nestedness (sne).
beta_core <- betapart::betapart.core(year_species_pa)
beta_res <- betapart::beta.pair(beta_core, index.family = "sor")
sor_mat <- as.matrix(beta_res$beta.sor)
sim_mat <- as.matrix(beta_res$beta.sim)
sne_mat <- as.matrix(beta_res$beta.sne)
# Consecutive-year dissimilarities pulled from the pairwise matrices.
years <- sort(as.numeric(rownames(sor_mat)))
beta_trends <- tibble::tibble(
Year1 = years[-length(years)],
Year2 = years[-1],
beta_sor = purrr::map2_dbl(Year1, Year2, ~ sor_mat[as.character(.x), as.character(.y)]),
beta_sim = purrr::map2_dbl(Year1, Year2, ~ sim_mat[as.character(.x), as.character(.y)]),
beta_sne = purrr::map2_dbl(Year1, Year2, ~ sne_mat[as.character(.x), as.character(.y)])
)
# Linear trend of each component against the later year of the pair.
lm_sor <- lm(beta_sor ~ Year2, data = beta_trends)
lm_sim <- lm(beta_sim ~ Year2, data = beta_trends)
lm_sne <- lm(beta_sne ~ Year2, data = beta_trends)
# Extract fit quality from a simple linear model: R-squared and the
# p-value of the (single) slope term, both rounded to 3 decimal places.
get_lm_stats <- function(model) {
  fit <- summary(model)
  coefs <- fit$coefficients
  list(
    r2 = round(fit$r.squared, 3),
    p = round(coefs[2, 4], 3)
  )
}
# R² and slope p-value for each component's trend (used in the subtitle).
sor_stats <- get_lm_stats(lm_sor)
sim_stats <- get_lm_stats(lm_sim)
sne_stats <- get_lm_stats(lm_sne)
# Long format for plotting, with readable component names.
beta_trends_long <- beta_trends %>%
tidyr::pivot_longer(c(beta_sor, beta_sim, beta_sne),
names_to = "Component", values_to = "Dissimilarity") %>%
dplyr::mutate(Component = dplyr::recode(Component,
beta_sor = "Sorensen", beta_sim = "Turnover", beta_sne = "Nestedness"))
subtitle_text <- paste0(
"Sorensen: R² = ", sor_stats$r2, ", p = ", sor_stats$p, " | ",
"Turnover: R² = ", sim_stats$r2, ", p = ", sim_stats$p, " | ",
"Nestedness: R² = ", sne_stats$r2, ", p = ", sne_stats$p)
# Plot consecutive-year beta-diversity components.
# `size` for lines has been deprecated in favour of `linewidth` since
# ggplot2 3.4; the rest of this script already uses `linewidth`, so the
# geom_line() call is updated for consistency (points still use `size`).
ggplot(beta_trends_long, aes(x = Year2, y = Dissimilarity, color = Component)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  scale_color_manual(values = c("Sorensen" = "darkorchid",
                                "Turnover" = "steelblue",
                                "Nestedness" = "darkorange")) +
  labs(
    title = "Year-to-year change in beta diversity components – Thames Estuary (survey-balanced)",
    subtitle = subtitle_text,
    x = "Year", y = "Dissimilarity", color = "Component"
  ) +
  theme_minimal()
# --- Setup & load ---
# NOTE(review): second setwd() to the same folder; redundant if the script
# is run top-to-bottom.
setwd("~/PhD/London NERC DTP documents/EA data/EnvironmentAgency_EcologyFish_DataDownload_2025-04-02")
lengths <- read_csv("TraC_Fish_Individual_Lengths_2025-10-10.csv")
sites <- read_csv("TraC_Fish_Sites_2025-10-10.csv")
# Join + tidy
# Same structure as the counts pipeline: join site info, keep analysis
# columns, parse dates, and infer gear (METHOD) from keywords in SITE_NAME.
lengths_clean <- lengths %>%
left_join(sites, by = "SITE_ID") %>%
select(
SITE_ID,
SITE_NAME = SITE_NAME.x,
SITE_PARENT_NAME,
EVENT_DATE,
SURVEY_ID,
SPECIES_NAME,
LATIN_NAME,
FISH_LENGTH,
TOP_TIER_SITE,
SITE_RANKED_NGR,
SITE_RANKED_EASTING,
SITE_RANKED_NORTHING
) %>%
mutate(
EVENT_DATE = ymd(EVENT_DATE),
EVENT_YEAR = year(EVENT_DATE),
EVENT_MONTH = month(EVENT_DATE),
EVENT_DAY = day(EVENT_DATE),
METHOD = case_when(
str_detect(str_to_lower(SITE_NAME), "seine") ~ "Seine Net",
str_detect(str_to_lower(SITE_NAME), "beam") ~ "Beam Trawl",
str_detect(str_to_lower(SITE_NAME), "otter") ~ "Otter Trawl",
str_detect(str_to_lower(SITE_NAME), "kick") ~ "Kick Sample",
str_detect(str_to_lower(SITE_NAME), "fyke") ~ "Fyke Net",
str_detect(str_to_lower(SITE_NAME), "push") ~ "Push Net",
str_detect(str_to_lower(SITE_NAME), "hand") ~ "Hand Collection",
str_detect(str_to_lower(SITE_NAME), "gill") ~ "Gill Net",
TRUE ~ "Unknown"
)
)
# Thames + Seine only
lengths_thames_filtered <- lengths_clean %>%
filter(str_detect(TOP_TIER_SITE, regex("thames", ignore_case = TRUE)),
METHOD == "Seine Net",
!is.na(FISH_LENGTH), FISH_LENGTH > 0)
# (Optional) Species-wise IQR outlier removal
# Tukey fences per species: keep lengths in [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
lengths_thames_filtered <- lengths_thames_filtered %>%
group_by(LATIN_NAME) %>%
mutate(
Q1 = quantile(FISH_LENGTH, 0.25, na.rm = TRUE),
Q3 = quantile(FISH_LENGTH, 0.75, na.rm = TRUE),
IQRv = Q3 - Q1,
lo = Q1 - 1.5 * IQRv,
hi = Q3 + 1.5 * IQRv
) %>%
ungroup() %>%
filter(FISH_LENGTH >= lo, FISH_LENGTH <= hi)
# Per-survey summaries (one row per survey)
survey_means <- lengths_thames_filtered %>%
group_by(EVENT_YEAR, SURVEY_ID) %>%
summarise(
mean_len_survey = mean(FISH_LENGTH),
median_len_survey = median(FISH_LENGTH),
n_fish = n(),
.groups = "drop"
)
# Yearly (survey-balanced) summaries
# NOTE(review): median_length is the median of per-survey *means*
# (mean_len_survey), not of per-survey medians — confirm intended.
survey_bal <- survey_means %>%
group_by(EVENT_YEAR) %>%
summarise(
mean_length = mean(mean_len_survey), # equal weight per survey
median_length = median(mean_len_survey),
sd_mean = sd(mean_len_survey),
n_surveys = n(),
se_mean = sd_mean / sqrt(n_surveys),
.groups = "drop"
) %>%
arrange(EVENT_YEAR)
# Linear trends on survey-balanced means
lm_mean <- lm(mean_length ~ EVENT_YEAR, data = survey_bal)
lm_median <- lm(median_length ~ EVENT_YEAR, data = survey_bal)
# Pull the EVENT_YEAR slope estimate and its p-value out of a fitted lm,
# returned as an unnamed-scalar list for use with with()/sprintf().
get_coef <- function(mod) {
  ctab <- summary(mod)$coefficients
  list(
    slope = unname(ctab["EVENT_YEAR", "Estimate"]),
    p = unname(ctab["EVENT_YEAR", "Pr(>|t|)"])
  )
}
# Annotation labels with slope (mm/yr) and p-value for each trend line.
lab_mean <- with(get_coef(lm_mean), sprintf("Mean: slope = %.2f mm/yr (p = %.3g)", slope, p))
lab_median <- with(get_coef(lm_median), sprintf("Median: slope = %.2f mm/yr (p = %.3g)", slope, p))
# Plot with top annotations
# y = Inf pins both annotations to the top of the panel; vjust staggers them.
x_left <- min(survey_bal$EVENT_YEAR, na.rm = TRUE)
ggplot(survey_bal, aes(EVENT_YEAR)) +
geom_ribbon(aes(ymin = mean_length - se_mean, ymax = mean_length + se_mean),
alpha = 0.15, fill = "steelblue") +
geom_line(aes(y = mean_length, color = "Mean"), linewidth = 1.2) +
geom_point(aes(y = mean_length, color = "Mean"), size = 2) +
geom_smooth(aes(y = mean_length, color = "Mean"), method = "lm", se = FALSE, linewidth = 0.9) +
geom_line(aes(y = median_length, color = "Median"), linewidth = 1.2, linetype = "dashed") +
geom_point(aes(y = median_length, color = "Median"), size = 2, shape = 17) +
geom_smooth(aes(y = median_length, color = "Median"), method = "lm", se = FALSE, linetype = "dashed", linewidth = 0.9) +
annotate("text", x = x_left, y = Inf, hjust = 0, vjust = 1.4, label = lab_mean, color = "steelblue", size = 4) +
annotate("text", x = x_left, y = Inf, hjust = 0, vjust = 2.8, label = lab_median, color = "darkorange", size = 4) +
scale_y_continuous(expand = expansion(mult = c(0.02, 0.18))) +
scale_color_manual(values = c("Mean" = "steelblue", "Median" = "darkorange")) +
labs(title = "Survey-balanced fish length trends – Thames Estuary",
subtitle = "Mean ± SE (solid) and median (dashed) per year with linear trends",
x = "Year", y = "Fish length (mm)", color = "Statistic") +
theme_minimal(base_size = 13) +
theme(legend.position = "top", panel.grid.minor = element_blank())
# Seven most frequently sampled sites.
# WD
# ===== Second pass: reruns the counts pipeline from the top; the analysis
# below is restricted to the seven most frequently sampled parent sites.
# This duplicates the earlier setup/load/join/select/date steps. =====
setwd("~/PhD/London NERC DTP documents/EA data/EnvironmentAgency_EcologyFish_DataDownload_2025-04-02")
# Load data
counts <- read_csv("TraC_Fish_Counts_2025-10-10.csv")
sites <- read_csv("TraC_Fish_Sites_2025-10-10.csv")
# Join count data with site information
counts_clean <- counts %>% left_join(sites, by = "SITE_ID") %>%
mutate(EVENT_DATE = ymd(EVENT_DATE),
EVENT_YEAR = year(EVENT_DATE),
EVENT_MONTH = month(EVENT_DATE),
SEASON = if_else(EVENT_MONTH %in% 4:9, "Summer", "Winter"))
counts_clean <- counts_clean %>%
select(
SITE_ID,
SITE_NAME = SITE_NAME.x,
SITE_PARENT_NAME,
SURVEY_ID,
EVENT_DATE,
SPECIES_NAME,
LATIN_NAME,
FISH_COUNT,
NO_OF_RUNS,
TOP_TIER_SITE,
SITE_RANKED_NGR,
SITE_RANKED_EASTING,
SITE_RANKED_NORTHING)
# Convert dates and extract components
counts_clean <- counts_clean %>%
mutate(
EVENT_DATE = ymd(EVENT_DATE),
EVENT_DATE_YEAR = year(EVENT_DATE),
EVENT_MONTH = month(EVENT_DATE),
EVENT_DAY = day(EVENT_DATE))
# Sampling method
# Gear inferred from keywords in the site name, as in the first pass.
counts_clean <- counts_clean %>%
mutate(
METHOD = case_when(
str_detect(str_to_lower(SITE_NAME), "seine") ~ "Seine Net",
str_detect(str_to_lower(SITE_NAME), "beam") ~ "Beam Trawl",
str_detect(str_to_lower(SITE_NAME), "otter") ~ "Otter Trawl",
str_detect(str_to_lower(SITE_NAME), "kick") ~ "Kick Sample",
str_detect(str_to_lower(SITE_NAME), "fyke") ~ "Fyke Net",
str_detect(str_to_lower(SITE_NAME), "push") ~ "Push Net",
str_detect(str_to_lower(SITE_NAME), "hand") ~ "Hand Collection",
str_detect(str_to_lower(SITE_NAME), "gill") ~ "Gill Net",
TRUE ~ "Unknown"))
# Thames only
counts_thames <- counts_clean %>%
filter(str_detect(TOP_TIER_SITE, regex("thames", ignore_case = TRUE)))
# Restrict to the seven named parent sites.
counts_thames <- counts_thames %>%
filter(SITE_PARENT_NAME %in% c(
"Richmond",
"Kew",
"Battersea",
"Greenwich",
"West Thurrock",
"Denton Wharf",
"Stanford-le-Hope Beach"
))
# Lat and long
# British National Grid (EPSG:27700) -> WGS84 (EPSG:4326) for leaflet.
sites_sf <- st_as_sf(counts_thames, coords = c("SITE_RANKED_EASTING", "SITE_RANKED_NORTHING"), crs = 27700) %>%
st_transform(crs = 4326)
coords <- st_coordinates(sites_sf)
sites_sf$lon <- coords[, 1]
sites_sf$lat <- coords[, 2]
# Gears -------------------------------------------------------------------
# METHOD_TYPE previously re-derived the gear from SITE_NAME with exactly the
# same keyword rules used for METHOD above. METHOD is already present on
# every row of counts_thames (and therefore of sites_sf), so it is copied
# instead of recomputed.
sites_sf <- sites_sf %>%
  mutate(METHOD_TYPE = METHOD)
##### Map #####
# Leaflet map and relative-abundance bar chart for the 7-site subset
# (same plotting code as the first pass).
method_types <- unique(sites_sf$METHOD_TYPE)
method_pal <- colorFactor(palette = "Set2", domain = method_types)
leaflet(sites_sf) %>%
addProviderTiles("CartoDB.Positron") %>%
addCircleMarkers(
lng = ~lon, lat = ~lat,
color = ~method_pal(METHOD_TYPE),
radius = 6,
stroke = TRUE, weight = 1, fillOpacity = 0.9,
popup = ~paste0(
"<b>Site:</b> ", SITE_NAME, "<br>",
"<b>Zone:</b> ", TOP_TIER_SITE, "<br>",
"<b>METHOD:</b> ", METHOD_TYPE), group = ~METHOD_TYPE) %>%
addLegend(
position = "bottomright",
pal = method_pal,
values = ~METHOD_TYPE,
title = "Method type",
opacity = 1) %>%
addLayersControl(
overlayGroups = method_types,
options = layersControlOptions(collapsed = FALSE),
position = "topright") %>%
addResetMapButton()
# RA
method_totals <- counts_thames %>%
filter(!is.na(METHOD)) %>%
group_by(METHOD) %>%
summarise(
Total_Individuals = sum(FISH_COUNT, na.rm = TRUE),
Total_Species = n_distinct(SPECIES_NAME),
.groups = "drop")
# Get grand total
grand_total <- sum(method_totals$Total_Individuals)
# Add relative abundance column
method_totals <- method_totals %>%
mutate(
Relative_Abundance = round(100 * Total_Individuals / grand_total, 1)
) %>%
arrange(desc(Relative_Abundance)) %>%
mutate(METHOD = factor(METHOD, levels = unique(METHOD)))
##### RA #####
ggplot(method_totals, aes(x = reorder(METHOD, -Relative_Abundance), y = Relative_Abundance)) +
geom_col(fill = "tomato") +
labs(
title = "Relative abundance by sampling method",
x = "Sampling method",
y = "Relative abundance (%)") +
theme_minimal()
# Matrix
# Species x method presence/absence, uniqueness and heatmap for the
# 7-site subset (same logic as the first pass).
method_species_table <- counts_thames %>%
filter(!is.na(METHOD), !is.na(SPECIES_NAME)) %>%
count(METHOD, SPECIES_NAME) %>%
pivot_wider(names_from = METHOD, values_from = n, values_fill = 0)
method_pa <- method_species_table %>%
mutate(across(-SPECIES_NAME, ~ as.integer(. > 0)))
method_cols <- names(method_pa)[!names(method_pa) %in% "SPECIES_NAME"]
method_unique <- method_pa %>%
rowwise() %>%
mutate(unique_to = if (sum(c_across(all_of(method_cols))) == 1) {
method_cols[which(c_across(all_of(method_cols)) == 1)]
} else {
NA_character_
}) %>%
ungroup() %>%
filter(!is.na(unique_to))
unique_counts <- method_unique %>%
count(unique_to, name = "unique_species_count") %>%
arrange(desc(unique_species_count))
ggplot(unique_counts, aes(x = reorder(unique_to, unique_species_count), y = unique_species_count, fill = unique_to)) +
geom_col(show.legend = FALSE) +
coord_flip() +
labs(
title = "Number of species unique to each method type",
x = "Sampling method", y = "Unique species count") +
theme_minimal()
method_pa_long <- method_pa %>%
pivot_longer(cols = -SPECIES_NAME, names_to = "METHOD", values_to = "present")
ggplot(method_pa_long, aes(x = METHOD, y = SPECIES_NAME, fill = as.factor(present))) +
geom_tile(color = "white") +
scale_fill_manual(values = c("0" = "white", "1" = "steelblue")) +
labs(
title = "Species presence across sampling method",
x = "Sampling method", y = "Species",
fill = "Present") +
theme_minimal() +
theme(axis.text.y = element_text(size = 6))
# 1) Filter to Seine Net
# Seine-net CPUE section for the 7-site subset (same logic as the first pass).
counts_thames_filtered <- counts_thames %>%
filter(METHOD == "Seine Net")
# 2) Per-survey CPUE (fish per run)
# - If NO_OF_RUNS can vary within a survey, switch `first()` to `max()` and sanity-check.
seine_cpue <- counts_thames_filtered %>%
filter(!is.na(FISH_COUNT), NO_OF_RUNS > 0) %>%
group_by(SURVEY_ID, EVENT_DATE_YEAR) %>%
summarise(
total_fish = sum(FISH_COUNT, na.rm = TRUE),
runs = dplyr::first(NO_OF_RUNS), # or max(NO_OF_RUNS)
n_checks = n_distinct(NO_OF_RUNS), # for sanity checking
.groups = "drop"
) %>%
mutate(CPUE = total_fish / runs)
# Optional sanity check:
# stopifnot(all(seine_cpue$n_checks == 1))
# 3) Yearly, survey-balanced summary (each survey = 1 vote)
cpue_year <- seine_cpue %>%
group_by(EVENT_DATE_YEAR) %>%
summarise(
mean_CPUE = mean(CPUE, na.rm = TRUE),
sd_CPUE = sd(CPUE, na.rm = TRUE),
n_surveys = n(),
se = sd_CPUE / sqrt(n_surveys),
.groups = "drop"
)
# 4) Quick overview plot (mean ± SE)
p1 <- ggplot(cpue_year, aes(x = EVENT_DATE_YEAR, y = mean_CPUE)) +
geom_ribbon(aes(ymin = mean_CPUE - se, ymax = mean_CPUE + se),
alpha = 0.2, fill = "steelblue") +
geom_line(colour = "steelblue", linewidth = 1) +
geom_point(colour = "steelblue", size = 2) +
labs(
title = "Mean seine net CPUE per year – Thames Estuary (survey-balanced)",
x = "Year",
y = "Mean CPUE (fish per run)"
) +
theme_minimal()
# 5) Trend model on yearly means
lm1 <- lm(mean_CPUE ~ EVENT_DATE_YEAR, data = cpue_year)
cpue_year$pred <- predict(lm1)
cpue_year$pred_se <- predict(lm1, se.fit = TRUE)$se.fit
s <- summary(lm1)
slope <- coef(s)["EVENT_DATE_YEAR","Estimate"]
pval <- coef(s)["EVENT_DATE_YEAR","Pr(>|t|)"]
lab <- sprintf("LM slope = %.2f fish/run/yr (p = %.3f)", slope, pval)
# 6) Trend plot with fitted mean and 95% CI of the fit
p2 <- ggplot(cpue_year, aes(EVENT_DATE_YEAR, mean_CPUE)) +
geom_point(size = 2) +
geom_line(col = "grey40") +
geom_ribbon(aes(ymin = pred - 1.96 * pred_se, ymax = pred + 1.96 * pred_se),
fill = "lightblue", alpha = 0.35) +
geom_line(aes(y = pred), col = "blue", linewidth = 1) +
labs(
title = "Trend in mean seine net CPUE – Thames Estuary (survey-balanced)",
x = "Year", y = "Mean CPUE (fish per run)"
) +
annotate("text",
x = min(cpue_year$EVENT_DATE_YEAR, na.rm = TRUE) + 1,
y = max(cpue_year$mean_CPUE, na.rm = TRUE),
hjust = 0, vjust = 1, size = 3.5, label = lab) +
theme_minimal()
p1
p2
# --- per-survey CPUE per species ---
# Per-species trend fitting for the 7-site subset (same logic as the
# first pass; species_model_summary is redefined identically here).
species_svy_cpue <- counts_thames_filtered %>%
filter(NO_OF_RUNS > 0) %>%
group_by(SURVEY_ID, EVENT_DATE_YEAR, SPECIES_NAME) %>%
summarise(
count = sum(FISH_COUNT, na.rm = TRUE),
runs = dplyr::first(NO_OF_RUNS),
.groups = "drop"
) %>%
mutate(CPUE = count / runs)
# --- yearly, survey-balanced CPUE per species ---
species_trends <- species_svy_cpue %>%
group_by(EVENT_DATE_YEAR, SPECIES_NAME) %>%
summarise(
mean_CPUE = mean(CPUE, na.rm = TRUE),
n_surveys = n(),
.groups = "drop"
)
# --- per-species trend (weights = n_surveys) ---
species_model_summary <- function(df) {
if (dplyr::n_distinct(df$EVENT_DATE_YEAR) < 3 || sd(df$mean_CPUE, na.rm = TRUE) == 0) {
return(tibble(Slope = NA_real_, CI_low = NA_real_, CI_high = NA_real_,
P_value = NA_real_, Significance = NA_character_))
}
m <- lm(mean_CPUE ~ EVENT_DATE_YEAR, data = df, weights = n_surveys)
s <- summary(m)
slope <- coef(s)["EVENT_DATE_YEAR","Estimate"]
se <- coef(s)["EVENT_DATE_YEAR","Std. Error"]
p <- coef(s)["EVENT_DATE_YEAR","Pr(>|t|)"]
tibble(
Slope = slope,
CI_low = slope - 1.96 * se,
CI_high = slope + 1.96 * se,
P_value = p,
Significance = dplyr::case_when(
p <= 0.001 ~ "***",
p <= 0.01 ~ "**",
p <= 0.05 ~ "*",
TRUE ~ ""
)
)
}
# Fit and plot per-species trends for the 7-site subset.
species_results <- species_trends %>%
group_by(SPECIES_NAME) %>%
group_modify(~ species_model_summary(.x)) %>%
ungroup() %>%
mutate(
SigCat = factor(Significance,
levels = c("***","**","*",""),
labels = c("***","**","*","ns"))
)
species_results_sig <- species_results %>%
filter(P_value <= 0.05) %>%
mutate(
SigCat = factor(Significance,
levels = c("***","**","*"),
labels = c("***","**","*"))
)
# --- Define colour palette for significance ---
sig_cols <- c(
"***" = "#3b82f6", # blue
"**" = "#22c55e", # green
"*" = "#ef4444" # red
)
# --- Plot only significant species ---
ggplot(species_results_sig, aes(x = Slope, y = reorder(SPECIES_NAME, Slope))) +
geom_vline(xintercept = 0, linetype = "dashed", color = "gray70") +
geom_errorbarh(aes(xmin = CI_low, xmax = CI_high), height = 0.3, color = "gray70") +
geom_point(aes(color = SigCat), size = 3) +
scale_color_manual(values = sig_cols, drop = FALSE, name = "Significance",
breaks = c("***","**","*")) +
labs(
title = "Significant species-level trends in abundance (CPUE) – Thames Estuary",
subtitle = "Only species with p ≤ 0.05 shown. Slope ± 95% CI (weighted by number of surveys).",
x = "Slope (Δ CPUE per year)",
y = "Species",
caption = "Significance: * p ≤ 0.05 | ** p ≤ 0.01 | *** p ≤ 0.001"
) +
theme_minimal() +
theme(
legend.position = "top",
legend.direction = "horizontal",
legend.title = element_text(size = 11),
legend.text = element_text(size = 10)
)
# Per-survey species totals; .groups = "drop_last" deliberately keeps the
# result grouped by year and survey so the next summarise() works per survey.
svy_comp <- counts_thames_filtered %>%
group_by(EVENT_DATE_YEAR, SURVEY_ID, SPECIES_NAME) %>%
summarise(n = sum(FISH_COUNT, na.rm = TRUE), .groups = "drop_last")
# Diversity metrics per survey (one row per year x survey).
div_svy <- svy_comp %>%
summarise(
Species_Richness = n_distinct(SPECIES_NAME),
Total_Abundance = sum(n),
Shannon_Diversity = vegan::diversity(n, index = "shannon"),
Simpson_Diversity = vegan::diversity(n, index = "simpson"),
.groups = "drop"
) %>%
mutate(
# pmax() guards log() against values below 1. NOTE(review): a survey with a
# single individual still gives 0/0 = NaN for Margalef — confirm intended.
Margalef_Richness = (Species_Richness - 1) / log(pmax(Total_Abundance, 1)),
# pmax(.., 2) avoids dividing by log(1) = 0 for single-species surveys.
Pielou_Evenness = Shannon_Diversity / log(pmax(Species_Richness, 2))
)
# Year-level summary of the per-survey diversity metrics: the mean and its
# standard error (sd / sqrt(n)) per metric, plus the number of surveys.
diversity_summary <- div_svy %>%
  dplyr::group_by(EVENT_DATE_YEAR) %>%
  dplyr::summarise(
    Richness    = mean(Species_Richness),
    se_Richness = sd(Species_Richness) / sqrt(dplyr::n()),
    Shannon     = mean(Shannon_Diversity),
    se_Shannon  = sd(Shannon_Diversity) / sqrt(dplyr::n()),
    Simpson     = mean(Simpson_Diversity),
    se_Simpson  = sd(Simpson_Diversity) / sqrt(dplyr::n()),
    Margalef    = mean(Margalef_Richness),
    se_Margalef = sd(Margalef_Richness) / sqrt(dplyr::n()),
    Pielou      = mean(Pielou_Evenness),
    se_Pielou   = sd(Pielou_Evenness) / sqrt(dplyr::n()),
    n_surveys   = dplyr::n(),
    .groups     = "drop"
  )
# Long format (one row per year x metric) for faceted plotting.
diversity_long <- diversity_summary %>%
  dplyr::select(EVENT_DATE_YEAR, Richness, Shannon, Simpson, Margalef, Pielou) %>%
  tidyr::pivot_longer(
    cols      = -EVENT_DATE_YEAR,
    names_to  = "Metric",
    values_to = "Value"
  )
# Faceted time series: raw yearly means (red line) overlaid with a LOESS
# smoother (black); y scales free because the metrics differ in range.
ggplot(diversity_long, aes(x = EVENT_DATE_YEAR, y = Value)) +
geom_line(color = "red", linewidth = 1) +
geom_smooth(method = "loess", se = TRUE, color = "black", linewidth = 1) +
facet_wrap(~ Metric, scales = "free_y") +
labs(
title = "Diversity metrics over time – Thames Estuary (survey-balanced)",
subtitle = "Per-survey diversity averaged per year (LOESS trend)",
x = "Year", y = "Diversity metric (mean)"
) +
theme_minimal()
# Linear trend of each yearly (survey-balanced) diversity metric vs. year.
lm_richness <- lm(Richness ~ EVENT_DATE_YEAR, data = diversity_summary)
lm_shannon  <- lm(Shannon ~ EVENT_DATE_YEAR, data = diversity_summary)
lm_simpson  <- lm(Simpson ~ EVENT_DATE_YEAR, data = diversity_summary)
lm_margalef <- lm(Margalef ~ EVENT_DATE_YEAR, data = diversity_summary)
lm_evenness <- lm(Pielou ~ EVENT_DATE_YEAR, data = diversity_summary)

# Extract the EVENT_DATE_YEAR slope from a fitted model into a one-row
# data.frame with a normal-approximation 95% CI and significance stars.
summary_table <- function(model, metric_name) {
  cf  <- summary(model)$coefficients
  est <- cf["EVENT_DATE_YEAR", "Estimate"]
  std <- cf["EVENT_DATE_YEAR", "Std. Error"]
  # p-value column name differs between t- and z-based summaries
  p_col <- intersect(c("Pr(>|t|)", "Pr(>|z|)"), colnames(cf))[1]
  pval  <- cf["EVENT_DATE_YEAR", p_col]
  stars <- case_when(
    pval <= 0.001 ~ "***",
    pval <= 0.01  ~ "**",
    pval <= 0.05  ~ "*",
    TRUE          ~ ""
  )
  data.frame(
    Metric       = metric_name,
    Slope        = est,
    SE           = std,
    CI_low       = est - 1.96 * std,
    CI_high      = est + 1.96 * std,
    P_value      = round(pval, 4),
    Significance = stars
  )
}
# Combine the per-metric trend rows into one table.
results <- bind_rows(
  summary_table(lm_richness, "Species richness"),
  summary_table(lm_shannon, "Shannon diversity"),
  summary_table(lm_simpson, "Simpson diversity"),
  summary_table(lm_margalef, "Margalef richness"),
  summary_table(lm_evenness, "Pielou evenness")
)

# Add a plotting factor for significance; metric labels stay unstarred.
results_lab <- results %>%
  mutate(
    SigCat = factor(
      Significance,
      levels = c("***", "**", "*", ""),
      labels = c("***", "**", "*", "ns")
    ),
    Metric_lab = Metric  # keep metric names clean, no stars
  )

# Significance colour palette, including a grey for non-significant.
sig_cols <- c(
  "***" = "#3b82f6",  # blue
  "**"  = "#22c55e",  # green
  "*"   = "#ef4444",  # red
  "ns"  = "#9ca3af"   # grey
)
# Forest-style plot of diversity-metric slopes: point = slope, horizontal
# bar = 95% CI, colour = significance; metrics ordered by slope.
ggplot(results_lab, aes(x = Slope, y = reorder(Metric_lab, Slope))) +
# dashed reference line at zero change per year
geom_vline(xintercept = 0, linetype = "dashed", color = "gray70") +
geom_errorbarh(aes(xmin = CI_low, xmax = CI_high), height = 0.22, color = "gray70") +
geom_point(aes(color = SigCat), size = 3) +
scale_color_manual(values = sig_cols, drop = FALSE, name = "Significance",
breaks = c("***","**","*","ns")) +
labs(
title = "Trend in diversity metrics over time – Thames Estuary",
subtitle = "Slope ± 95% CI from linear models on yearly survey-balanced means",
x = "Slope (change per year)", y = "Diversity metric (mean)",
caption = "Significance: * p ≤ 0.05 | ** p ≤ 0.01 | *** p ≤ 0.001"
) +
theme_minimal() +
theme(legend.position = "top",
legend.direction = "horizontal",
legend.title = element_text(size = 11),
legend.text = element_text(size = 10))
# Per-survey presence/absence of each species.
pa_svy <- counts_thames_filtered %>%
filter(!is.na(SPECIES_NAME), !is.na(FISH_COUNT)) %>%
mutate(PA = as.integer(FISH_COUNT > 0)) %>%
group_by(EVENT_DATE_YEAR, SURVEY_ID, SPECIES_NAME) %>%
summarise(present = as.integer(any(PA == 1)), .groups = "drop")
# Number of surveys per year, used to balance sampling effort across years.
svy_per_year <- pa_svy %>%
distinct(EVENT_DATE_YEAR, SURVEY_ID) %>%
count(EVENT_DATE_YEAR, name = "n_surveys")
min_svy <- min(svy_per_year$n_surveys)
# Rarefy: draw the same number of surveys (the yearly minimum) from every
# year; a fixed seed makes the random draw reproducible.
set.seed(42)
sampled_surveys <- pa_svy %>%
distinct(EVENT_DATE_YEAR, SURVEY_ID) %>%
group_by(EVENT_DATE_YEAR) %>%
slice_sample(n = min_svy) %>%
ungroup()
# Year x species presence/absence matrix from the balanced survey sample
# (rows = years, columns = species), the input format betapart expects.
year_species_pa <- pa_svy %>%
semi_join(sampled_surveys, by = c("EVENT_DATE_YEAR","SURVEY_ID")) %>%
group_by(EVENT_DATE_YEAR, SPECIES_NAME) %>%
summarise(present = as.integer(any(present == 1)), .groups = "drop") %>%
tidyr::pivot_wider(names_from = SPECIES_NAME, values_from = present, values_fill = 0) %>%
tibble::column_to_rownames("EVENT_DATE_YEAR") %>%
as.data.frame()
# Sorensen-family pairwise beta diversity: total (sor) partitioned into
# turnover (sim) and nestedness (sne) components.
beta_core <- betapart::betapart.core(year_species_pa)
beta_res <- betapart::beta.pair(beta_core, index.family = "sor")
sor_mat <- as.matrix(beta_res$beta.sor)
sim_mat <- as.matrix(beta_res$beta.sim)
sne_mat <- as.matrix(beta_res$beta.sne)
# Consecutive-year dissimilarities looked up from the pairwise matrices
# (rownames are the years, so indexing is by character).
years <- sort(as.numeric(rownames(sor_mat)))
beta_trends <- tibble::tibble(
Year1 = years[-length(years)],
Year2 = years[-1],
beta_sor = purrr::map2_dbl(Year1, Year2, ~ sor_mat[as.character(.x), as.character(.y)]),
beta_sim = purrr::map2_dbl(Year1, Year2, ~ sim_mat[as.character(.x), as.character(.y)]),
beta_sne = purrr::map2_dbl(Year1, Year2, ~ sne_mat[as.character(.x), as.character(.y)])
)
# Linear trend of each beta-diversity component against the later year of
# each consecutive pair.
lm_sor <- lm(beta_sor ~ Year2, data = beta_trends)
lm_sim <- lm(beta_sim ~ Year2, data = beta_trends)
lm_sne <- lm(beta_sne ~ Year2, data = beta_trends)

# Pull rounded R^2 and the slope p-value from a fitted lm for the subtitle.
get_lm_stats <- function(model) {
  fit_sum <- summary(model)
  list(
    r2 = round(fit_sum$r.squared, 3),
    p  = round(coef(fit_sum)[2, 4], 3)
  )
}
sor_stats <- get_lm_stats(lm_sor)
sim_stats <- get_lm_stats(lm_sim)
sne_stats <- get_lm_stats(lm_sne)
# Long format with readable component names for plotting.
beta_trends_long <- beta_trends %>%
  tidyr::pivot_longer(
    c(beta_sor, beta_sim, beta_sne),
    names_to  = "Component",
    values_to = "Dissimilarity"
  ) %>%
  dplyr::mutate(
    Component = dplyr::recode(
      Component,
      beta_sor = "Sorensen",
      beta_sim = "Turnover",
      beta_sne = "Nestedness"
    )
  )

# Subtitle carrying the per-component trend statistics.
subtitle_text <- paste0(
  "Sorensen: R² = ", sor_stats$r2, ", p = ", sor_stats$p, " | ",
  "Turnover: R² = ", sim_stats$r2, ", p = ", sim_stats$p, " | ",
  "Nestedness: R² = ", sne_stats$r2, ", p = ", sne_stats$p)
# Year-to-year beta-diversity components over time, one coloured line per
# component, with trend statistics in the subtitle.
# `linewidth` replaces the `size` aesthetic for lines (deprecated in
# ggplot2 >= 3.4), matching its use elsewhere in this script.
ggplot(beta_trends_long, aes(x = Year2, y = Dissimilarity, color = Component)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  scale_color_manual(values = c("Sorensen" = "darkorchid",
                                "Turnover" = "steelblue",
                                "Nestedness" = "darkorange")) +
  labs(
    title = "Year-to-year change in beta diversity components – Thames Estuary (survey-balanced)",
    subtitle = subtitle_text,
    x = "Year", y = "Dissimilarity", color = "Component"
  ) +
  theme_minimal()
# --- Setup & load ---
# NOTE(review): setwd() makes the script machine-specific; consider
# project-relative paths (e.g. the here package) instead.
setwd("~/PhD/London NERC DTP documents/EA data/EnvironmentAgency_EcologyFish_DataDownload_2025-04-02")
# Individual fish length records and site metadata (EA TraC fish surveys).
lengths <- read_csv("TraC_Fish_Individual_Lengths_2025-10-10.csv")
sites <- read_csv("TraC_Fish_Sites_2025-10-10.csv")
# Join + tidy
# Attach site metadata to each length record, keep the columns of interest,
# derive date parts, and infer the sampling METHOD from keywords in the
# site name (falls back to "Unknown" when no keyword matches).
lengths_clean <- lengths %>%
left_join(sites, by = "SITE_ID") %>%
select(
SITE_ID,
# SITE_NAME.x is the name from `lengths`; the join suffixes the collision
SITE_NAME = SITE_NAME.x,
SITE_PARENT_NAME,
EVENT_DATE,
SURVEY_ID,
SPECIES_NAME,
LATIN_NAME,
FISH_LENGTH,
TOP_TIER_SITE,
SITE_RANKED_NGR,
SITE_RANKED_EASTING,
SITE_RANKED_NORTHING
) %>%
mutate(
EVENT_DATE = ymd(EVENT_DATE),
EVENT_YEAR = year(EVENT_DATE),
EVENT_MONTH = month(EVENT_DATE),
EVENT_DAY = day(EVENT_DATE),
# keyword -> gear mapping; first match wins (case_when is ordered)
METHOD = case_when(
str_detect(str_to_lower(SITE_NAME), "seine") ~ "Seine Net",
str_detect(str_to_lower(SITE_NAME), "beam") ~ "Beam Trawl",
str_detect(str_to_lower(SITE_NAME), "otter") ~ "Otter Trawl",
str_detect(str_to_lower(SITE_NAME), "kick") ~ "Kick Sample",
str_detect(str_to_lower(SITE_NAME), "fyke") ~ "Fyke Net",
str_detect(str_to_lower(SITE_NAME), "push") ~ "Push Net",
str_detect(str_to_lower(SITE_NAME), "hand") ~ "Hand Collection",
str_detect(str_to_lower(SITE_NAME), "gill") ~ "Gill Net",
TRUE ~ "Unknown"
)
)
# Thames + Seine only
# Restrict to Thames top-tier sites sampled by seine net with usable lengths.
lengths_thames_filtered <- lengths_clean %>%
  filter(
    str_detect(TOP_TIER_SITE, regex("thames", ignore_case = TRUE)),
    METHOD == "Seine Net",
    !is.na(FISH_LENGTH),
    FISH_LENGTH > 0
  )

# Keep only the core monitoring sites along the estuary gradient.
lengths_thames_filtered2 <- lengths_thames_filtered %>%
  filter(SITE_PARENT_NAME %in% c(
    "Richmond", "Kew", "Battersea", "Greenwich",
    "West Thurrock", "Denton Wharf", "Stanford-le-Hope Beach"
  ))
# (Optional) Species-wise IQR outlier removal
# Tukey fences per species: keep lengths in [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
# Note: the helper columns (Q1, Q3, IQRv, lo, hi) are never dropped and so
# remain in the resulting data frame.
lengths_thames_filtered <- lengths_thames_filtered2 %>%
group_by(LATIN_NAME) %>%
mutate(
Q1 = quantile(FISH_LENGTH, 0.25, na.rm = TRUE),
Q3 = quantile(FISH_LENGTH, 0.75, na.rm = TRUE),
IQRv = Q3 - Q1,
lo = Q1 - 1.5 * IQRv,
hi = Q3 + 1.5 * IQRv
) %>%
ungroup() %>%
filter(FISH_LENGTH >= lo, FISH_LENGTH <= hi)
# One row per survey: mean and median length plus number of fish measured.
survey_means <- lengths_thames_filtered %>%
  group_by(EVENT_YEAR, SURVEY_ID) %>%
  summarise(
    mean_len_survey   = mean(FISH_LENGTH),
    median_len_survey = median(FISH_LENGTH),
    n_fish            = dplyr::n(),
    .groups = "drop"
  )

# Yearly summaries giving each survey equal weight, with the SE of the
# yearly mean across surveys.
survey_bal <- survey_means %>%
  group_by(EVENT_YEAR) %>%
  summarise(
    mean_length   = mean(mean_len_survey),
    median_length = median(mean_len_survey),
    sd_mean       = sd(mean_len_survey),
    n_surveys     = dplyr::n(),
    se_mean       = sd_mean / sqrt(n_surveys),
    .groups = "drop"
  ) %>%
  arrange(EVENT_YEAR)
# Linear trends on the survey-balanced yearly means.
lm_mean   <- lm(mean_length ~ EVENT_YEAR, data = survey_bal)
lm_median <- lm(median_length ~ EVENT_YEAR, data = survey_bal)

# Slope and p-value of the EVENT_YEAR term from a fitted lm.
get_coef <- function(mod) {
  cf <- summary(mod)$coefficients
  list(
    slope = unname(cf["EVENT_YEAR", "Estimate"]),
    p     = unname(cf["EVENT_YEAR", "Pr(>|t|)"])
  )
}

# Annotation strings for the plot (slope in mm per year).
lab_mean <- with(get_coef(lm_mean), sprintf("Mean: slope = %.2f mm/yr (p = %.3g)", slope, p))
lab_median <- with(get_coef(lm_median), sprintf("Median: slope = %.2f mm/yr (p = %.3g)", slope, p))
# Plot with top annotations
# Mean (solid, with SE ribbon) and median (dashed) yearly lengths plus
# linear trend lines; slope/p annotations pinned to the top-left.
x_left <- min(survey_bal$EVENT_YEAR, na.rm = TRUE)
ggplot(survey_bal, aes(EVENT_YEAR)) +
# +/- 1 SE band around the yearly mean
geom_ribbon(aes(ymin = mean_length - se_mean, ymax = mean_length + se_mean),
alpha = 0.15, fill = "steelblue") +
geom_line(aes(y = mean_length, color = "Mean"), linewidth = 1.2) +
geom_point(aes(y = mean_length, color = "Mean"), size = 2) +
geom_smooth(aes(y = mean_length, color = "Mean"), method = "lm", se = FALSE, linewidth = 0.9) +
geom_line(aes(y = median_length, color = "Median"), linewidth = 1.2, linetype = "dashed") +
geom_point(aes(y = median_length, color = "Median"), size = 2, shape = 17) +
geom_smooth(aes(y = median_length, color = "Median"), method = "lm", se = FALSE, linetype = "dashed", linewidth = 0.9) +
# y = Inf with vjust offsets stacks the two labels just below the top edge
annotate("text", x = x_left, y = Inf, hjust = 0, vjust = 1.4, label = lab_mean, color = "steelblue", size = 4) +
annotate("text", x = x_left, y = Inf, hjust = 0, vjust = 2.8, label = lab_median, color = "darkorange", size = 4) +
# extra headroom at the top so the annotations do not overlap the data
scale_y_continuous(expand = expansion(mult = c(0.02, 0.18))) +
scale_color_manual(values = c("Mean" = "steelblue", "Median" = "darkorange")) +
labs(title = "Survey-balanced fish length trends – Thames Estuary",
subtitle = "Mean ± SE (solid) and median (dashed) per year with linear trends",
x = "Year", y = "Fish length (mm)", color = "Statistic") +
theme_minimal(base_size = 13) +
theme(legend.position = "top", panel.grid.minor = element_blank())
# Thames + Seine only
# Re-derive the Thames seine-net subset from the cleaned length records
# (repeats the earlier filter so this section can be run on its own;
# this also discards the IQR-filtered version created above).
lengths_thames_filtered <- lengths_clean %>%
  filter(
    str_detect(TOP_TIER_SITE, regex("thames", ignore_case = TRUE)),
    METHOD == "Seine Net",
    !is.na(FISH_LENGTH),
    FISH_LENGTH > 0
  )

lengths_thames_filtered2 <- lengths_thames_filtered %>%
  filter(SITE_PARENT_NAME %in% c(
    "Richmond", "Kew", "Battersea", "Greenwich",
    "West Thurrock", "Denton Wharf", "Stanford-le-Hope Beach"
  ))
# Tidy lengths for modelling: trim whitespace in species names, drop rows
# unusable for the models, and centre year so the intercept sits at the
# mean year.
len_dat <- lengths_thames_filtered2 %>%
  mutate(LATIN_NAME = str_squish(LATIN_NAME)) %>%
  filter(
    !is.na(FISH_LENGTH), FISH_LENGTH > 0,
    !is.na(EVENT_YEAR), !is.na(SITE_ID)
  ) %>%
  mutate(Yc = EVENT_YEAR - mean(EVENT_YEAR, na.rm = TRUE))

# Species must span >= 3 years and have >= 30 measurements to be modelled.
keep <- len_dat %>%
  group_by(LATIN_NAME) %>%
  summarise(n_years = n_distinct(EVENT_YEAR), n = dplyr::n(), .groups = "drop") %>%
  filter(n_years >= 3, n >= 30) %>%
  pull(LATIN_NAME)
dat_keep <- len_dat %>% filter(LATIN_NAME %in% keep)
# Fit a per-species length trend: a random-intercept LMM over sites when
# sites have replicated observations, otherwise a plain-LM fallback.
#
# df needs: FISH_LENGTH, Yc (centred year), SITE_ID.
# Returns a one-row tibble (model, Slope, SE, p, r2_m, r2_c, n_obs,
# n_sites, max_rep) or NULL when the fit itself fails.
fit_one <- function(df) {
  n_obs <- nrow(df)
  n_sites <- dplyr::n_distinct(df$SITE_ID)
  max_rep <- if (n_obs == 0) 0 else df %>% count(SITE_ID, name = "n") %>% pull(n) %>% max(na.rm = TRUE)
  # LMM is only identifiable with within-site replication and fewer sites than obs
  use_lmm <- (max_rep >= 2) && (n_sites < n_obs)
  if (use_lmm) {
    fit <- try(suppressWarnings(lmer(FISH_LENGTH ~ Yc + (1|SITE_ID), data = df, REML = TRUE)), silent = TRUE)
    if (inherits(fit, "try-error")) return(NULL)
    co <- coef(summary(fit))
    beta <- unname(co["Yc","Estimate"])
    se <- unname(co["Yc","Std. Error"])
    # p-value: use a reported p column when present (e.g. via lmerTest);
    # otherwise fall back to a normal approximation on the t/z statistic.
    pcol <- intersect(colnames(co), c("Pr(>|t|)", "Pr(>|z|)"))
    if (length(pcol) >= 1) {
      pval <- unname(co["Yc", pcol[1]])
    } else {
      tcol <- intersect(colnames(co), c("t value","z value","t.value","z.value"))
      if (length(tcol) == 0) {
        # no usable statistic column at all — report NA rather than erroring
        pval <- NA_real_
      } else {
        # take the first match only: intersect() can return several names,
        # which previously produced a vector-valued p
        tval <- unname(co["Yc", tcol[1]])
        pval <- 2 * pnorm(abs(tval), lower.tail = FALSE)
      }
    }
    r2_m <- r2_c <- NA_real_
    r2 <- try(suppressWarnings(suppressMessages(performance::r2_nakagawa(fit))), silent = TRUE)
    if (!inherits(r2, "try-error")) { r2_m <- r2$R2_marginal; r2_c <- r2$R2_conditional }
    return(tibble(
      model = "LMM (1|SITE)",
      Slope = beta,
      SE = se,
      p = pval,
      r2_m = r2_m,
      r2_c = r2_c,
      n_obs = n_obs, n_sites = n_sites, max_rep = max_rep
    ))
  }
  # LM fallback (no site replication) — reuse the counts computed above
  # instead of recomputing them from df
  fit <- try(suppressWarnings(lm(FISH_LENGTH ~ Yc, data = df)), silent = TRUE)
  if (inherits(fit, "try-error")) return(NULL)
  s <- summary(fit); co <- coef(s)
  tibble(
    model = "LM",
    Slope = unname(co["Yc","Estimate"]),
    SE = unname(co["Yc","Std. Error"]),
    p = unname(co["Yc","Pr(>|t|)"]),
    r2_m = s$r.squared,
    r2_c = NA_real_,
    n_obs = n_obs,
    n_sites = n_sites,
    max_rep = if (n_obs > 0) max_rep else NA_real_  # keep original NA on empty input
  )
}
# Wrapper around fit_one() that never errors: on failure (try-error or
# NULL) it returns an all-NA row that still carries the sample-size
# bookkeeping columns, so map_dfr() keeps a row per species.
safe_fit_one <- function(df) {
  res <- try(fit_one(df), silent = TRUE)
  failed <- inherits(res, "try-error") || is.null(res)
  if (!failed) {
    return(res)
  }
  tibble(
    model = NA_character_,
    Slope = NA_real_, SE = NA_real_, p = NA_real_,
    r2_m = NA_real_, r2_c = NA_real_,
    n_obs = nrow(df),
    n_sites = dplyr::n_distinct(df$SITE_ID),
    max_rep = if (nrow(df) > 0) df %>% count(SITE_ID, name = "n") %>% pull(n) %>% max(na.rm = TRUE) else NA_real_
  )
}
# Fit one trend per species: split by species, fit, then re-attach the
# species identifiers (group_split + map_dfr keeps safe_fit_one decoupled
# from the grouping columns).
species_size_trends <- dat_keep %>%
group_by(LATIN_NAME, SPECIES_NAME) %>%
group_split(.keep = TRUE) %>%
purrr::map_dfr(function(df) {
safe_fit_one(df) %>%
mutate(
LATIN_NAME = df$LATIN_NAME[1],
SPECIES_NAME = df$SPECIES_NAME[1]
)
}) %>%
relocate(LATIN_NAME, SPECIES_NAME) %>%
# map_dfr() already returns an ungrouped tibble, so this ungroup() is a no-op
ungroup() %>%
mutate(
# star codes; the !is.na(p) guards route NA p-values to the "" level
Sig = dplyr::case_when(
!is.na(p) & p <= 0.001 ~ "***",
!is.na(p) & p <= 0.01 ~ "**",
!is.na(p) & p <= 0.05 ~ "*",
TRUE ~ ""
)
)
# --- Keep only significant species (p <= 0.05) ---
# A single filtered table with a star-level plotting factor. (An earlier
# duplicate `sig_trends` assignment that was immediately overwritten by
# this one has been removed as dead code.)
sig_trends <- species_size_trends %>%
  filter(p <= 0.05) %>%
  mutate(
    SigCat = factor(Sig,
                    levels = c("***", "**", "*"),
                    labels = c("***", "**", "*"))
  )
# --- Define the same significance colour palette ---
# Hex colours keyed by significance stars, matching the earlier figures.
sig_cols <- c(
"***" = "#3b82f6", # blue
"**" = "#22c55e", # green
"*" = "#ef4444" # red
)
# --- Plot: consistent significance colours ---
# Forest-style plot of body-size slopes: point = slope (+/- SE bar),
# colour = significance, shape = which model type was fitted.
ggplot(sig_trends, aes(x = Slope, y = reorder(SPECIES_NAME, Slope))) +
# dashed reference line at zero change per year
geom_vline(xintercept = 0, linetype = "dashed", color = "gray70") +
# note: bars here are +/- 1 SE, not a 95% CI
geom_errorbarh(aes(xmin = Slope - SE, xmax = Slope + SE),
height = 0.25, color = "gray60") +
geom_point(aes(color = SigCat, shape = model), size = 3) +
scale_color_manual(values = sig_cols, name = "Significance",
breaks = c("***", "**", "*")) +
scale_shape_manual(values = c("LMM (1|SITE)" = 16, "LM" = 1),
name = "Model") +
labs(
title = "Significant species body-size trends – Thames Estuary",
subtitle = "Slope ± SE (mm/year). LMM used when sites have replication; else LM fallback.",
x = "Change in mean length (mm per year)",
y = "Species",
caption = "Significance: * p ≤ 0.05 | ** p ≤ 0.01 | *** p ≤ 0.001"
) +
theme_minimal(base_size = 12) +
theme(
legend.position = "top",
legend.box = "horizontal",
legend.title = element_text(size = 11),
legend.text = element_text(size = 10)
)
# Thames seine-net lengths with complete grouping variables for the
# juvenile/adult analysis.
dat_len <- lengths_clean %>%
  filter(
    str_detect(TOP_TIER_SITE, regex("thames", ignore_case = TRUE)),
    METHOD == "Seine Net",
    !is.na(FISH_LENGTH), FISH_LENGTH > 0,
    !is.na(EVENT_YEAR), !is.na(SURVEY_ID),
    !is.na(SITE_PARENT_NAME), !is.na(SPECIES_NAME)
  )

# Core estuary monitoring sites only.
dat_len2 <- dat_len %>%
  filter(SITE_PARENT_NAME %in% c(
    "Richmond", "Kew", "Battersea", "Greenwich",
    "West Thurrock", "Denton Wharf", "Stanford-le-Hope Beach"
  ))
# Species-specific length thresholds from the core-site data: the 25th
# percentile marks the "juvenile" cut-off, the 75th the "adult" cut-off;
# species need >= 30 length records to get thresholds.
thr <- dat_len2 %>%
group_by(SPECIES_NAME) %>%
summarise(
n_len = n(),
L_juv = quantile(FISH_LENGTH, 0.25, na.rm = TRUE),
L_adlt = quantile(FISH_LENGTH, 0.75, na.rm = TRUE),
.groups = "drop"
) %>% filter(n_len >= 30)
# Flag each fish as juvenile/adult relative to its species' thresholds; the
# inner join drops species without thresholds.
# NOTE(review): thresholds are computed from dat_len2 (core sites) but
# applied to dat_len (all Thames seine sites) — confirm this is intended.
flagged <- dat_len %>%
inner_join(thr, by = "SPECIES_NAME") %>%
mutate(
is_juv = FISH_LENGTH <= L_juv,
is_adult = FISH_LENGTH >= L_adlt
)
# Per-survey proportions of juvenile and adult fish.
sb_svy <- flagged %>%
group_by(SURVEY_ID, EVENT_YEAR) %>%
summarise(
prop_juv = mean(is_juv, na.rm = TRUE),
prop_adult = mean(is_adult, na.rm = TRUE),
.groups = "drop"
)
# Yearly means and SEs across surveys (each survey weighted equally).
sb_year <- sb_svy %>%
group_by(EVENT_YEAR) %>%
summarise(
mean_prop_juv = mean(prop_juv),
se_prop_juv = sd(prop_juv)/sqrt(n()),
mean_prop_adult = mean(prop_adult),
se_prop_adult = sd(prop_adult)/sqrt(n()),
.groups = "drop"
)
# Long format: "Group_stat" names split into Group + mean/se columns via
# the ".value" sentinel; 95% CIs clamped to the [0, 1] proportion scale.
sb_long <- sb_year %>%
transmute(
EVENT_YEAR,
Juveniles_mean = mean_prop_juv, Juveniles_se = se_prop_juv,
Adults_mean = mean_prop_adult, Adults_se = se_prop_adult
) %>%
pivot_longer(-EVENT_YEAR, names_to = c("Group",".value"), names_sep = "_") %>%
mutate(
lower = pmax(0, mean - 1.96*se),
upper = pmin(1, mean + 1.96*se),
Source = "Survey-balanced"
)
# Binomial GLMMs for the probability that a measured fish is juvenile /
# adult: year (standardised) as fixed effect, random intercepts for parent
# site and species.
m_juv <- glmer(
as.integer(is_juv) ~ scale(EVENT_YEAR) + (1|SITE_PARENT_NAME) + (1|SPECIES_NAME),
data = flagged, family = binomial()
)
m_ad <- glmer(
as.integer(is_adult) ~ scale(EVENT_YEAR) + (1|SITE_PARENT_NAME) + (1|SPECIES_NAME),
data = flagged, family = binomial()
)
# Model-based (LS-mean) yearly probabilities, back-transformed to the
# response scale at each observed year.
yrs <- sort(unique(flagged$EVENT_YEAR))
emm_juv <- as.data.frame(emmeans(m_juv, ~ EVENT_YEAR, at = list(EVENT_YEAR = yrs), type = "response"))
emm_ad <- as.data.frame(emmeans(m_ad, ~ EVENT_YEAR, at = list(EVENT_YEAR = yrs), type = "response"))
# NOTE(review): prob_col appears unused below — extract_emm() re-detects the
# estimate column itself; confirm nothing later in the file relies on it.
prob_col <- if ("prob" %in% names(emm_juv)) "prob" else "emmean"
# Normalise an emmeans data frame into a tidy tibble of yearly estimates
# with a 95% CI, tagged with the group label and a fixed Source string.
# Handles the column-name variants emmeans emits for the estimate
# ("prob"/"emmean"/"response") and the CI bounds ("asymp.LCL"/"lower.CL"/
# "LCL" and their upper counterparts), stopping with an informative
# message when a required column is missing.
extract_emm <- function(emm_df, group_label) {
  cols <- names(emm_df)
  # candidate estimate columns, in preference order
  est_hits <- intersect(c("prob", "emmean", "response"), cols)
  if (length(est_hits) == 0)
    stop("No estimate column found in emmeans output. Available cols: ",
         paste(cols, collapse = ", "))
  # CI bound candidates (asymptotic vs. t-based naming)
  lo_hits <- intersect(c("asymp.LCL", "lower.CL", "LCL"), cols)
  hi_hits <- intersect(c("asymp.UCL", "upper.CL", "UCL"), cols)
  if (length(lo_hits) == 0 || length(hi_hits) == 0)
    stop("No CI columns found in emmeans output. Available cols: ",
         paste(cols, collapse = ", "))
  # the year column must survive into the emmeans grid
  if (!"EVENT_YEAR" %in% cols)
    stop("EVENT_YEAR not found in emmeans output. Available cols: ",
         paste(cols, collapse = ", "))
  tibble::tibble(
    EVENT_YEAR = as.integer(emm_df[["EVENT_YEAR"]]),
    mean = as.numeric(emm_df[[est_hits[1]]]),
    lower = as.numeric(emm_df[[lo_hits[1]]]),
    upper = as.numeric(emm_df[[hi_hits[1]]]),
    Group = group_label,
    Source = "LS-mean (GLMM)"
  )
}
# Tidy the GLMM LS-means for both groups and stack them with the
# survey-balanced series for a side-by-side comparison.
juv_df <- extract_emm(as.data.frame(emm_juv), "Juveniles")
ad_df <- extract_emm(as.data.frame(emm_ad), "Adults")
lsm_long <- dplyr::bind_rows(juv_df, ad_df)

both <- bind_rows(
  sb_long %>% select(EVENT_YEAR, Group, mean, lower, upper, Source),
  lsm_long
)

# Shared colour / fill / linetype keys for the two data sources.
cols <- c("Survey-balanced" = "#1f77b4", "LS-mean (GLMM)" = "#e41a1c")
fills <- c("Survey-balanced" = "#9ecae1", "LS-mean (GLMM)" = "#fcbba1")
ltys <- c("Survey-balanced" = "solid", "LS-mean (GLMM)" = "solid")
# Compare survey-balanced yearly proportions with GLMM LS-means, one facet
# per group, CI ribbons per source.
# `linewidth` replaces the `size` aesthetic for lines (deprecated in
# ggplot2 >= 3.4), matching its use elsewhere in this script.
ggplot(both, aes(EVENT_YEAR, mean, colour = Source, fill = Source, linetype = Source)) +
  geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.20, colour = NA) +
  geom_line(linewidth = 1) +
  geom_point(size = 1.8) +
  facet_wrap(~ Group, ncol = 1, scales = "free_y") +
  scale_y_continuous(labels = percent_format(accuracy = 1)) +
  scale_colour_manual(values = cols) +
  scale_fill_manual(values = fills) +
  scale_linetype_manual(values = ltys) +
  labs(
    title = "Juvenile & adult proportions: survey-balanced vs LS-mean (GLMM)",
    x = "Year", y = "Proportion of measured fish", colour = "", fill = "", linetype = ""
  ) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "top", strip.text = element_text(face = "bold"))
# Thames Fisheries Experiment - angling data ----